
/*
Replication dofile for: "Do community water sources provide safe drinking water? Evidence from a randomized experiment in rural Bangladesh"

This version: 02-2021

Notes: 
- this dofile cleans data and constructs variables used in analysis.  
- you may also need to install additional STATA packages in order to run this code.  
- You need to replace the home directory with the location of the Replication Folder before running the code.  
*/

clear all
set more off
set maxvar 30000
set matsize 10000
eststo clear

*cd .. (replace your own file path here and uncomment to navigation to Replication Folder) 

global data "data"
global dofiles "dofiles"
global path_tables "Tables"
global path_graphs "Graphs"

/// DEFINE PROGRAM TO CORRECT BACTERIA CONTAMINATION DATA

*summarize rates of fraction positive by water source type, round of data collection and and lag length
* obtain globals to be used in the corrections below

use "$data/bac_entry_lags.dta", clear

foreach lag in 0 1 2 3 4 5plus {
	forvalues i = 0(1)1 {
		forvalues j = 0(1)1 {
			quiet sum enterbac_fc if bacteria_entry_lag_`lag' == 1 & baseline == `i' & hh_data == `j'
			global F_p_lag`lag'_b`i'_s`j' = r(mean)		
		}	
	}
}	

**** step 2: program to correct HH/WS bacteria test results ***

capture program drop fecal_correction
program fecal_correction
syntax, invar(varname) baseline(integer) household(integer)
	* create dummies
	foreach v of varlist `invar' {
		gen `v'_0 = (`v'<1) if `v'!=.
		gen `v'_1 = (`v'>=1 & `v'<2) if `v'!=.
		gen `v'_2 = (`v'>=2 & `v'<3) if `v'!=.
		gen `v'_3 = (`v'>=3 & `v'<4) if `v'!=.
		gen `v'_4 = (`v'>=4 & `v'<5) if `v'!=.
		gen `v'_5plus = (`v'>=5) if `v'!=.
	}
	* if samples have missing data for lag length, assume 2 days for that sample
	replace `invar'_2 = 1 if `invar' == . & enterbac_fc_temp != .
	* use correction factors calculated above
	gen f_p = .
	foreach lag in 0 1 2 3 4 5plus {
		replace f_p = ${F_p_lag`lag'_b`baseline'_s`household'} if `invar'_`lag' == 1
	}
	* enter sensitivity and specificity from paper estimates, using best fit curves
	gen sensitivity = .
	gen specificity = .
	foreach lag in 0 1 2 3 4 {
		replace sensitivity = 0.1607 * ln(`lag')+0.7293 if `invar'_`lag' == 1
		replace specificity = -0.1392 * ln(`lag') + 0.9974  if `invar'_`lag' == 1
	}
	local lag = 5
	replace sensitivity = 0.1607 * ln(`lag')+0.7293 if `invar'_5plus == 1
	replace specificity =-0.1392 * ln(`lag') + 0.9974 if `invar'_5plus == 1
	* calculate variable corresponding to expected value of contamination conditional on test result
	gen f_c = min((f_p - 1 + specificity)/(specificity + sensitivity - 1),1) if f_p != .
	gen enterbac_fc_exp_temp = .
	replace enterbac_fc_exp_temp = enterbac_fc_temp * ((sensitivity * f_c)/f_p) + (1 - enterbac_fc_temp) * (((1-sensitivity) * f_c)/(1-f_p))
	replace enterbac_fc_exp_temp  = (sensitivity * f_c)/f_p if enterbac_fc_temp == 1 
	drop enterbac_fc_temp
end

use "data/data_anonymized.dta", clear

**************************
**** general cleaning baseline ****
**************************

* variables for FE
encode union_code_correct, gen(union_code_num)
encode village_code_correct, gen(village_code_num)

* union/TU dummies
quiet tabulate union_code_correct, generate(u_)
quiet tabulate village_code_correct, generate(v_)

* treated
levelsof fu_treatment_status, l(statuses)
foreach status of local statuses{
	gen `status' = (fu_treatment_status=="`status'") if fu_treatment_status!=""
}

gen treated=(fu_treatment_status!="control") if fu_treatment_status!=""

* time to collect water, in mins
destring water_source_queue* dist_min_ws*, replace

* fix outliers

tab dist_min_ws1  // ok for water_source_min2 and water_source_min3
tab hh_distance_final_ws1 if dist_min_ws1>=100 & dist_min_ws1!=. 
replace dist_min_ws1 = 1 if hh_distance_final_ws1<25 & hh_distance_final_ws1!=. & dist_min_ws1>=100 & dist_min_ws1!=. // assume that 23 m are walked in 1 minute
replace dist_min_ws1 = 2 if hh_distance_final_ws1<35 & hh_distance_final_ws1!=. & dist_min_ws1>=100 & dist_min_ws1!=. // assume that 33 m are walked in 2 minutes
replace dist_min_ws1 = 3 if hh_distance_final_ws1>45 & hh_distance_final_ws1!=. & dist_min_ws1>=100 & dist_min_ws1!=. // assume that 45 m are walked in 3 mins
foreach i of numlist 1(1)3 {
	gen collect_water_mins`i' = dist_min_ws`i'*2 + water_source_queue`i' + 0.5 // "Time needed to collect water (mins)" : round trip, queueing time and 30 seconds to get the water.
}
replace collect_water_mins = collect_water_mins1 if cclpg_sample==1 // used in the analysis: only primary source


**********************************************
**** asset index and self reported income ****
**********************************************

* follow: https://www.stata.com/statalist/archive/2013-09/msg00668.html

**All vars are at endline 
global hh_assets rooms hh_asset_bed_quantity hh_asset_sofa_quantity hh_asset_chair_quantity hh_asset_table_quantity hh_asset_wardrobe_qty 	hh_asset_watch_quantity hh_asset_fridge_quantity hh_asset_fan_quantity hh_asset_generator_qty hh_asset_powerloom_qty hh_asset_jewelry_value 	hh_asset_radio_quantity hh_asset_tv_quantity hh_asset_mobile_qty hh_asset_mob_int_qty hh_asset_pc_quantity hh_asset_bicycle_quantity 	hh_asset_motorbike_qty hh_asset_ancart_qty hh_asset_car_quantity hh_asset_boat_quantity hh_asset_rickshaw_qty hh_livestock_draftc_qty hh_livestock_dairyc_qty hh_livestock_goat_qty hh_livestock_chicke_qty own_land_quantity own_agriland_quantity
pca $hh_assets
predict hh_assets_comp1
xtile hh_assets_cat = hh_assets_comp1, nq(5)
gen hh_asset_topquintile = (hh_assets_cat==5) if hh_assets_cat!=.
label var hh_asset_topquintile "Wealthy households (top-20\% HH assets index)"

label var poverty_score_2usd "Poverty index"

gen self_middle_upper =(income_self_assessment=="middle" | income_self_assessment=="upper" ) if income_self_assessment!=""
gen self_upper =(income_self_assessment=="upper" ) if income_self_assessment!=""
gen self_poor =(income_self_assessment=="poor" | income_self_assessment=="very_poor") if income_self_assessment!="" 
label var self_upper "Upper income HH"
label var self_poor "Bottom income HH"

*************************************
**** correct fecal contamination endline ****
*************************************

* preliminary cleaning
gen fu_date_bacteria_entry = fu_hh_bac_test_scanned
gen fu_bacteria_entry_lag = ( fu_date_bacteria_entry - fu_date_today ) // in days

* run program for correction of HH data
gen enterbac_fc_temp = fu_hh_enterbac_fc if cclpg_sample==1
fecal_correction, invar(fu_bacteria_entry_lag) baseline(0) household(1)
gen fu_hh_enterbac_fc_exp = enterbac_fc_exp_temp if cclpg_sample==1
drop f_p sensitivity specificity f_c enterbac_fc_exp_temp

* run program for correction of WS data
foreach i of numlist 1(1)4 {
	gen enterbac_fc_temp = fu_enterbac_fc_ws`i' if cclpg_sample==1
	fecal_correction, invar(fu_ws_bacteria_entry_lag_`i') baseline(0) household(0)
	gen fu_enterbac_fc_exp_ws`i' = enterbac_fc_exp_temp  if cclpg_sample==1
	drop f_p sensitivity specificity f_c enterbac_fc_exp_temp
}

*************************************
**** correct fecal contamination baseline ****
*************************************

* preliminary cleaning
rename date_final date_today
gen date_bacteria_entry =dofc(datetime_bacteria)
format date_today date_bacteria_entry %td
gen bacteria_entry_lag = ( date_bacteria_entry - date_today ) // in days
*
foreach i of numlist 1(1)3 {
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "gen", "jan", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "mag", "may", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "giu", "jun", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "lug", "jul", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "ago", "aug", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "set", "sep", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "ott", "oct", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', "dic", "dec", .)
	replace enterbac_end_ws`i'=subinstr(enterbac_end_ws`i', ".", ":", .)
	gen date_bacteria_entry_ws`i' = date(enterbac_end_ws`i', "DMY hms") 
	format date_bacteria_entry_ws`i' %td
}
foreach i of numlist 1(1)3 {
	gen ws_bacteria_entry_lag_`i' =(date_bacteria_entry_ws`i' - date_today_ws`i')
}

* run program for correction of HH data
gen enterbac_fc_temp = hh_enterbac_fc
fecal_correction, invar(bacteria_entry_lag) baseline(1) household(1)
replace hh_enterbac_fc_exp = enterbac_fc_exp_temp if cclpg_sample==1
drop f_p sensitivity specificity f_c enterbac_fc_exp_temp

* run program for correction of WS data
foreach i of numlist 1(1)3 {
	gen enterbac_fc_temp = enterbac_fc_ws`i'
	fecal_correction, invar(ws_bacteria_entry_lag_`i') baseline(1) household(0)
	if `i'==1 replace enterbac_fc_exp_ws`i' = enterbac_fc_exp_temp if cclpg_sample==1
	if `i'!=1 gen enterbac_fc_exp_ws`i' = enterbac_fc_exp_temp if cclpg_sample==1
	drop f_p sensitivity specificity f_c enterbac_fc_exp_temp
}

*****************************************
*** create variables for analysis *******
*****************************************

* rename
foreach i of numlist 1(1)3 {
	rename water_source_times`i' water_source_times_week_`i'
	gen water_source_times_day_`i' = water_source_times_week_`i'/7
	rename water_source_id`i' source_id`i'
	rename water_source_amount`i' water_source_amount_`i'
}
rename hh_arsenic_field_res hh_arsenic_field_test_result

* share of water collected from each WS
destring water_source_times* water_source_amount*, replace 
foreach n of numlist 1(1)3 {
	gen water_collected_per_day_ws`n' = (water_source_times_day_`n'/7) * water_source_amount_`n'
	* fix outliers: max allowed is 100L per person per day; censor if above
	replace water_collected_per_day_ws`n' = num_hh_m*100 if water_collected_per_day_ws`n'/num_hh_m > 100 & water_collected_per_day_ws`n' !=. // same censoring also at FU
}
egen water_collected_per_day_tot = rowtotal(water_collected_per_day_ws*)
foreach n of numlist 1(1)3 {
	gen water_per_day_share_ws`n' = water_collected_per_day_ws`n'/water_collected_per_day_tot
}
label var water_per_day_share_ws1 "Share of water collected from primary WS"

* calculate vars for WS, weighted by the share of water collected from each WS -endline
foreach i of numlist 1(1)4 {
	rename fu_ws_treat_dummy_ws`i'  fu_treat_d_ws`i' 
	if `i'!=4 rename ws_treat_dummy_ws`i'  treat_d_ws`i' 
}

foreach i of numlist 1(1)4 {
	destring  fu_dist_min_ws`i', replace
	gen fu_arsenic_res_w_ws`i' = fu_ws_arsenic_field_test_res_`i' * fu_water_per_day_share_ws`i'
	foreach v of newlist fu_arsenic_50plus fu_arsenic_any fu_hh_distance_final fu_enterbac_fc fu_enterbac_fc_exp fu_dist_min fu_treat_d {
		capture gen `v'_w_ws`i' = `v'_ws`i' * fu_water_per_day_share_ws`i'
	}
}

foreach v of newlist arsenic_50plus arsenic_any hh_distance_final enterbac_fc enterbac_fc_exp dist_min  treat_d arsenic_res {
	if "`v'"!="treat_d" {
		egen fu_ws_`v'_wmean1 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. 
		egen fu_ws_`v'_wmean2 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. & fu_`v'_w_ws2 !=. & fu_num_water_source==2
		egen fu_ws_`v'_wmean3 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. & fu_`v'_w_ws2 !=. & fu_`v'_w_ws3 !=. & fu_num_water_source==3
		egen fu_ws_`v'_wmean4 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. & fu_`v'_w_ws2 !=. & fu_`v'_w_ws3 !=. & fu_`v'_w_ws4 !=. & fu_num_water_source==4
		gen fu_ws_`v'_wmean =  fu_ws_`v'_wmean1 if fu_num_water_source==1
			replace fu_ws_`v'_wmean = fu_ws_`v'_wmean2 if fu_num_water_source==2
			replace fu_ws_`v'_wmean = fu_ws_`v'_wmean3 if fu_num_water_source==3
			replace fu_ws_`v'_wmean = fu_ws_`v'_wmean4 if fu_num_water_source==4
		drop fu_ws_`v'_wmean1 fu_ws_`v'_wmean2 fu_ws_`v'_wmean3 fu_ws_`v'_wmean4
	}
	if "`v'"=="treat_d" {
		egen fu_ws_`v'_wmean1 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. 
		egen fu_ws_`v'_wmean2 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. & fu_`v'_w_ws2 !=. & fu_num_water_source==2
		egen fu_ws_`v'_wmean3 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. & fu_`v'_w_ws2 !=. & fu_`v'_w_ws3 !=. & fu_num_water_source==3
		egen fu_ws_`v'_wmean4 = rowtotal(fu_`v'_w_ws*) if fu_`v'_w_ws1!=. & fu_`v'_w_ws2 !=. & fu_`v'_w_ws3 !=. & fu_`v'_w_ws4 !=. & fu_num_water_source==4
		gen fu_ws_`v'_wmean =  fu_ws_`v'_wmean1 if fu_num_water_source==1
			replace fu_ws_`v'_wmean = fu_ws_`v'_wmean2 if fu_num_water_source==2
			replace fu_ws_`v'_wmean = fu_ws_`v'_wmean3 if fu_num_water_source==3
			replace fu_ws_`v'_wmean = fu_ws_`v'_wmean4 if fu_num_water_source==4
		drop fu_ws_`v'_wmean1 fu_ws_`v'_wmean2 fu_ws_`v'_wmean3 fu_ws_`v'_wmean4
	}
}

* installed wells per household 
gen inst_per_hh = constructed_tws/treatment_unit_size
replace inst_per_hh = 0 if constructed_tws == .
label var inst_per_hh "Wells installed per household"

* TU size
gen temp=1
bys union_code_correct village_code_correct: egen tu_size_final=total(temp)
count if tu_size_final!=treatment_unit_size // ??
drop temp

* num of HHs per WS
gen hh_per_ws = tu_size_final / ws_per_tu

* num of WSs per HH
gen ws_per_hh = tu_size_final / ws_per_tu

* num of HHs per TW
gen hh_per_offered_tw = tu_size_final / anchors_required

* amount of water collected per day, in L
foreach i of numlist 1(1)3 {
	gen collect_per_day_qty`i' = 2.5 if water_collected_per_day_cat`i' == "<5 litre"
	replace collect_per_day_qty`i' = 7.5 if water_collected_per_day_cat`i' == "5-10 litre" 
	replace collect_per_day_qty`i' = 15 if water_collected_per_day_cat`i' == "10-20 litre" 
	replace collect_per_day_qty`i' = 35 if water_collected_per_day_cat`i' == "20-50 litre" 
	replace collect_per_day_qty`i' = 75 if water_collected_per_day_cat`i' == "50-100 litre"
	replace collect_per_day_qty`i' = 150 if water_collected_per_day_cat`i' == "100-200 litre"
	replace collect_per_day_qty`i' = 200 if water_collected_per_day_cat`i' == ">200 litre"
}
replace collect_per_day_qty = collect_per_day_qty1 if cclpg_sample==1 

//Generate dummy variables from categorical variables
gen vill_res_trust_v_much = (village_residents_trust == "very_much") if village_residents_trust!="dk" & village_residents_trust!="ra" & village_residents_trust!="" // "don't know" coded as missing (only 5 obs)

gen know_association_ind = (know_association == "yes") if know_association!="ra" & know_association!="r" & know_association!="" // coded "don't know" as a no (since the dummy variable describes knowledge)

gen muslim = (hh_member_religion1=="muslim") if hh_member_religion1!=""


* success rate
rename success success_rate
gen check = constructed_tws/anchors_required
count if check!=success_rate & control==0 // 0, ok
drop check

* successful installation: success_rate (0 if no agreement, no installation, vendor failer, no contributions, no locations)
gen installation_attempt_rate = success_rate
replace installation_attempt_rate = 1 if success_rate==0 & vendor_failed==1
replace installation_attempt_rate = 1 if success_rate==0.5 & vendor_failed==0.5

* inferred value of time from WTP for socially optimal location
gen value_time_h = new_source_soc_wtp/new_source_soc_wtp_time // new_source_soc_wtp is BDT, and new_source_soc_wtp_time is hours
label var value_time_h "Inferred value of time (in hours)"

* generate self-reported income categorical variables
tab income_self_assessment, gen(inc)
forvalues i = 1(1)5 {
	gen inc`i'_t = inc`i' * treated
}
label var inc5_t "Very poor x treated"
label var inc3_t "Poor x treated"
label var inc1_t "Low income x treated"
label var inc2_t "Middle income x treated"
label var inc4_t "High income x treated"

* create vars for arsenic-/bacteria-related health vars
foreach i of numlist 1(1)15 {
	capture rename suffer_diarrhea`i' suffer_diarrhea_`i' 
	capture gen diarrhea_dummy_`i' = (suffer_diarrhea_`i'=="yes") if suffer_diarrhea_`i'!=""
	capture gen fu_diarrhea_dummy_`i' = (fu_suffer_diarrhea_`i'=="yes") if fu_suffer_diarrhea_`i'!=""
}
egen diarrhea_share = rowmean(diarrhea_dummy_*)
egen fu_diarrhea_share = rowmean(fu_diarrhea_dummy_*)


********************
**** label vars ****
********************

label var public_lottery_date "Date of public lottery"
label var labour "Labour"
label var cash "Cash"
label var control "Control"
label var waiver "Waiver"
label var treated "Treated"
*
label var muslim "Muslim household"
label var poverty_score_2usd "Poverty score - 2 USD"
label var new_source_soc_wtp "WTP (cash) - TW in socially optimal location"
label var new_source_soc_wtp_time "WTP (time) - TW in most socially optimal location"
label var no_educ_rate "Not educated HH members (\%)"
label var read_rate "Literacy rate in the household"
label var success_rate "Installation rate"
label var network_nominations "Network nominations"
label var network_size "Network size"
label var vill_res_trust_v_much "High trust towards community"
label var know_association_ind "Know association"
label var anchors_required "Offered TWs"
label var constructed_tws "Installed TWs"
lab var collect_water_mins "Time needed to collect water (mins)" // round trip, queueing time and 30 seconds to get the water.
lab var collect_per_day_qty "Water collected per day (litres)"
label var hh_arsenic_any "Arsenic contamination (WHO) (HH test)"
label var hh_arsenic_50plus "Arsenic contamination (BD) (HH test)"
label var hh_enterbac_fc "Bacteria contamination (HH test)"
label var arsenic_any_ws1 "Arsenic contamination (WHO) (primary WS)"
label var arsenic_50plus_ws1 "Arsenic contamination (BD) (primary WS)"
label var enterbac_fc_ws1 "Bacteria contamination (primary WS)"
label var treat_d_ws1 "Water is treated before drinking (primary WS)"
label var toilet_MICS_dummy "HH has some toilet facility"
label var num_water_source "Number of used water sources"


****************************************************
**** short treatment var names and interactions ****
****************************************************

gen c = cash
gen l = labour
gen w = waiver
gen co = control
rename hh_asset_* hh_a_*
rename vill_res_* v_r_*
rename know_association* know_ass*

* calculate vars for WS, weighted by the share of water collected from each WS
foreach i of numlist 1(1)3 {
	gen arsenic_res_w_ws`i' = arsenic_test_result_ws`i' * water_per_day_share_ws`i'
	foreach v of newlist arsenic_50plus arsenic_any hh_distance_final enterbac_fc enterbac_fc_exp dist_min treat_d {
		gen `v'_w_ws`i' = `v'_ws`i' * water_per_day_share_ws`i'
	}
}

foreach v of newlist arsenic_50plus arsenic_any hh_distance_final enterbac_fc enterbac_fc_exp dist_min treat_d arsenic_res {
	egen ws_`v'_wmean1 = rowtotal(`v'_w_ws*) if `v'_w_ws1!=. 
	egen ws_`v'_wmean2 = rowtotal(`v'_w_ws*) if `v'_w_ws1!=. & `v'_w_ws2 !=. & num_water_source==2
	egen ws_`v'_wmean3 = rowtotal(`v'_w_ws*) if `v'_w_ws1!=. & `v'_w_ws2 !=. & `v'_w_ws3 !=. & num_water_source==3
	gen ws_`v'_wmean =  ws_`v'_wmean1 if num_water_source==1
		replace ws_`v'_wmean = ws_`v'_wmean2 if num_water_source==2
		replace ws_`v'_wmean = ws_`v'_wmean3 if num_water_source==3
	drop ws_`v'_wmean1 ws_`v'_wmean2 ws_`v'_wmean3
}

* gen safe water at baseline and interaction with treatment
gen bas_safe_ws = (ws_arsenic_any_wmean == 0 & ws_enterbac_fc_wmean == 0 ) if ws_arsenic_any_wmean!=. & ws_enterbac_fc_wmean!=.
label var bas_safe_ws "Baseline use of safe WS"
gen t_bas_safe_ws = treated * bas_safe_ws
label var t_bas_safe_ws "Treated * Baseline use of safe WS"

* generate arsenic safety categorical variables
gen as_b_none = ( ws_arsenic_res_wmean < 10)
	label var as_b_none "As safe source (baseline)"
gen as_b_low = ( ws_arsenic_res_wmean >= 10 &  ws_arsenic_res_wmean < 50 )
	label var as_b_low "Low As Source (baseline)"
gen as_b_mid = ( ws_arsenic_res_wmean >= 50 &  ws_arsenic_res_wmean < 100 )
	label var as_b_mid "Moderate As source (baseline)"
gen as_b_high = ( ws_arsenic_res_wmean >= 100 )
	label var as_b_high "High As source (baseline)"
foreach cat in none low mid high {
	gen as_b_`cat'_treated = as_b_`cat' * treated	
}
label var as_b_none_treated "As safe source (baseline) x treated"
label var as_b_low_treated "Low As Source (baseline) x treated"
label var as_b_mid_treated "Moderate As Source (baseline) x treated"
label var as_b_high_treated "High As Source (baseline) x treated"

**********************************
**** create vars for analysis ****
**********************************

* dummies for water storage practices
capture drop fu_source_test_stored
gen fu_source_test_stored=(fu_source_water_test=="stored") if fu_source_water_test!="" & fu_source_water_test!="not_see" // observed
gen water_storage_d = (water_storage=="yes") if water_storage!="" // reported
	gen fu_water_storage_d = (fu_water_storage=="yes") if fu_water_storage!=""
gen water_storage_uncovered_d = (water_storage_covered=="no") if water_storage_covered!="" // reported
	gen fu_water_storage_uncovered_d = (fu_water_storage_covered=="no") if fu_water_storage_covered!=""
gen water_storage_uncovered_dd = (water_storage_covered=="no") if water_storage!="" // reported
	gen fu_water_storage_uncovered_dd = (fu_water_storage_covered=="no") if fu_water_storage!=""
gen water_storage_floor_d = (water_storage_floor=="yes" | water_storage_floor=="some") if water_storage_floor!=""
	gen fu_water_storage_floor_d = (fu_water_storage_floor=="yes" | fu_water_storage_floor=="some") if fu_water_storage_floor!=""
gen water_storage_floor_dd = (water_storage_floor=="yes" | water_storage_floor=="some") if water_storage!=""
	gen fu_water_storage_floor_dd = (fu_water_storage_floor=="yes" | fu_water_storage_floor=="some") if fu_water_storage!=""
gen fu_source_test_uncovered_d = (fu_source_water_test_covered=="no" ) if fu_source_water_test_covered!="" & fu_source_water_test_covered!="not_see" // not asked at baseline
gen fu_source_test_uncovered_dd = (fu_source_water_test_covered=="no" ) if fu_source_water_test!="" & fu_source_water_test!="not_see" & fu_source_water_test_covered!="not_see"
gen fu_source_test_floor_d = (fu_source_water_test_floor=="yes" ) if fu_source_water_test_floor!="" & fu_source_water_test_floor!="not_see" // not asked at baseline
gen fu_source_test_floor_dd = (fu_source_water_test_floor=="yes" ) if fu_source_water_test!="" & fu_source_water_test!="not_see" & fu_source_water_test_floor!="not_see" 
gen fu_source_test_scooped_d = (fu_source_water_test_use=="scoop" ) if fu_source_water_test_use!="" & fu_source_water_test_use!="not_see"
gen fu_source_test_scooped_dd = (fu_source_water_test_use=="scoop" ) if fu_source_water_test!="" & fu_source_water_test!="not_see" & fu_source_water_test_use!="not_see"
foreach v of varlist water_storage water_storage_floor fu_water_storage fu_water_storage_floor {
	order `v'_d , after(`v')
	capture order `v'_dd , after(`v'_d)
}
order water_storage_uncovered_d water_storage_uncovered_dd, after(water_storage_covered)
order fu_water_storage_uncovered_d fu_water_storage_uncovered_dd, after(fu_water_storage_covered)
order fu_source_test_stored - fu_source_test_scooped_dd, after(fu_source_water_test_use)

* arsenic vs lab result
gen fu_arsenic_lab_result_cat = 0 if fu_arsenic_lab_result<10 & fu_arsenic_lab_result!=.
	replace fu_arsenic_lab_result_cat = 10 if fu_arsenic_lab_result>=10 & fu_arsenic_lab_result<20 & fu_arsenic_lab_result!=.
	replace fu_arsenic_lab_result_cat = 25 if fu_arsenic_lab_result>=25 & fu_arsenic_lab_result<50 & fu_arsenic_lab_result!=.
	replace fu_arsenic_lab_result_cat = 50 if fu_arsenic_lab_result>=50 & fu_arsenic_lab_result<100 & fu_arsenic_lab_result!=.
	replace fu_arsenic_lab_result_cat = 100 if fu_arsenic_lab_result>=100 & fu_arsenic_lab_result<200 & fu_arsenic_lab_result!=.
	replace fu_arsenic_lab_result_cat = 200 if fu_arsenic_lab_result>=200 & fu_arsenic_lab_result<500 & fu_arsenic_lab_result!=.
	replace fu_arsenic_lab_result_cat = 500 if fu_arsenic_lab_result>=500 & fu_arsenic_lab_result!=.

* check reported vs measured distances
foreach i of numlist 1(1)4 {
	capture gen fu_diff_dist_ws`i' = fu_hh_distance_final_ws`i' - fu_dist_min_ws`i'*80 // 80 m per minute
	capture gen diff_dist_ws`i' = hh_distance_final_ws`i' - dist_min_ws`i'*80 // 80 m per minute
}


* means of arsenic and bacteria contamination for same HH over used WSs
egen ws_arsenic_any_mean = rowmean(arsenic_any_ws*)
egen ws_arsenic_50plus_mean = rowmean(arsenic_50plus_ws*)
egen ws_bac_mean = rowmean(enterbac_fc_ws*)
egen fu_ws_arsenic_any_mean = rowmean(fu_arsenic_any_ws*)
egen fu_ws_arsenic_50plus_mean = rowmean(fu_arsenic_50plus_ws*)
egen fu_ws_bac_mean = rowmean(fu_enterbac_fc_ws*)

* create vars in differences
foreach v of varlist hh_arsenic_50plus hh_arsenic_any hh_enterbac_fc hh_enterbac_fc_exp  ws_arsenic_any_mean ws_arsenic_50plus_mean ws_bac_mean num_water_source water_per_day_share_ws1 hh_distance_final_ws1 hh_distance_final_ws2 hh_distance_final_ws3 	ws_arsenic_50plus_wmean ws_arsenic_any_wmean ws_enterbac_fc_wmean ws_enterbac_fc_exp_wmean ws_hh_distance_final_wmean source_test_stored water_storage_d water_storage_uncovered_d water_storage_floor_d water_storage_uncovered_dd water_storage_floor_dd 	hh_arsenic_field_test_result ws_arsenic_res_wmean ws_dist_min_wmean  diarrhea_share {
	gen dif_`v' = fu_`v' - `v'
}
foreach v of newlist arsenic_50plus_ws arsenic_any_ws enterbac_fc_ws {
	foreach i of numlist 1(1)4 {
		capture gen dif_`v'`i' = fu_`v'`i' - `v'`i' 
	}
}

********************************
**** checks on final sample ****
********************************

* missing vars
gen hh_arsenic_miss = (hh_arsenic_50plus==.) if consent_given=="yes"
gen hh_bac_miss = (hh_enterbac_fc_exp==.) if consent_given=="yes"
gen fu_hh_arsenic_miss = (fu_hh_arsenic_50plus==.) if fu_consent_given=="yes"
gen fu_hh_bac_miss = (fu_hh_enterbac_fc_exp==.) if fu_consent_given=="yes"
gen hh_test_nonmiss = 1 if hh_arsenic_50plus!=. & hh_enterbac_fc_exp!=. & fu_hh_arsenic_50plus!=. & fu_hh_enterbac_fc_exp!=. ///
	& hh_arsenic_any!=. & fu_hh_arsenic_any!=.
	tab hh_test_nonmiss if consent_given=="yes" & fu_consent_given=="yes"
gen ws_test_nonmiss = 1 if ws_arsenic_50plus_wmean!=. & ws_arsenic_any_wmean!=. & ws_enterbac_fc_exp_wmean!=. & fu_ws_arsenic_50plus_wmean!=. ///
	&  fu_ws_arsenic_any_wmean!=. & fu_ws_enterbac_fc_exp_wmean!=. 
	tab ws_test_nonmiss if consent_given=="yes" & fu_consent_given=="yes"
gen water_test_nonmiss = 1 if hh_test_nonmiss==1 & ws_test_nonmiss==1
*

* missing vars before cleaning
gen hh_arsenic_miss_before = (hh_arsenic_50plus==.) if consent_given=="yes"
gen hh_bac_miss_before = (hh_enterbac_fc==.) if consent_given=="yes"
gen fu_hh_arsenic_miss_before = (fu_hh_arsenic_50plus==.) if fu_consent_given=="yes"
gen fu_hh_bac_miss_before = (fu_hh_enterbac_fc==.) if fu_consent_given=="yes"
gen hh_test_nonmiss_before = 1 if hh_arsenic_50plus!=. & hh_enterbac_fc!=. & fu_hh_arsenic_50plus!=. & fu_hh_enterbac_fc!=. ///
	& hh_arsenic_any!=. & fu_hh_arsenic_any!=.
	tab hh_test_nonmiss_before if consent_given=="yes" & fu_consent_given=="yes"
gen ws_test_nonmiss_before = 1 if ws_arsenic_50plus_wmean!=. & ws_arsenic_any_wmean!=. & ws_enterbac_fc_wmean!=. & fu_ws_arsenic_50plus_wmean!=. ///
	&  fu_ws_arsenic_any_wmean!=. & fu_ws_enterbac_fc_wmean!=. 
	tab ws_test_nonmiss_before if consent_given=="yes" & fu_consent_given=="yes"
gen water_test_nonmiss_before = 1 if hh_test_nonmiss_before==1 & ws_test_nonmiss_before==1


* final complete panel sample
gen complete_panel = 1 if panel_final==1 & dif_hh_arsenic_50plus!=. & dif_hh_enterbac_fc!=. & dif_ws_arsenic_50plus_wmean!=. & dif_ws_enterbac_fc_wmean!=.

label var panel_final "Panel HH: HH is interviewed at baseline and follow-up"
label var complete_panel "Panel HH with complete water quality data"

******************
**** weights *****
******************
* calculate weights for use in hh level regressions
gen hh_n = ustrright(hh_code_correct,6)
replace hh_n = ustrright(hh_code_correct,8) if length(hh_code_correct) == 21
replace hh_n =subinstr(hh_n,"_","0",.)
destring hh_n, replace
by village_code_correct, sort: egen hhs_b = count(hh_n) if consent_given == "yes"
gen pweight_b = 40/hhs_b
replace pweight_b = 1 if mics_sample==1
label var pweight_b "Probability weights for baseline analysis"

* calculate final weights
gen count=1 if complete_panel==1
bys village_code_correct: egen hhs_tot = total(count)
gen pweight_normal_final = 40/hhs_tot
order pweight_normal_final , after(pweight_b)
label var pweight_normal_final "Probability weights for panel analysis"

**HEALTH VARIABLES 

* generate share of diarrhea and share diarrhea under 5yo
forvalues n=1/20 {
	cap confirm variable diarrhea_dummy_`n'
	if _rc==0{
		*5orunder at baseline in hhs
		gen fiveorunder_dummy_`n'=(hh_member_age`n'<=5) if hh_member_age`n'!=.
		*diarrhea under 5 at baseline
		gen diarrhea_fiveorunder_dummy_`n'=diarrhea_dummy_`n'*fiveorunder_dummy_`n' if diarrhea_dummy_`n'!=.
		*arsenic at baseline 
		gen arsenicreported_dummy_`n'=suffer_arsenic_rep`n'=="symptoms" if suffer_arsenic_rep`n'!=""&suffer_arsenic_rep`n'!="dk"&suffer_arsenic_rep`n'!="ra"
		
		gen arsenicreported_num_`n'=(suffer_arsenic_rep`n'!=""&suffer_arsenic_rep`n'!="dk"&suffer_arsenic_rep`n'!="ra")
		
		gen arsenicobserved_dummy_`n'=suffer_arsenic_obs`n'=="symptoms_observed" if suffer_arsenic_obs`n'!=""&suffer_arsenic_obs`n'!="no_symptoms_unconf"
		
		gen arsenicobserved_num_`n'=(suffer_arsenic_obs`n'!=""&suffer_arsenic_obs`n'!="no_symptoms_unconf")

	}
	
	cap confirm variable fu_hh_member_age_`n'
	if _rc==0{
		rename fu_hh_member_age_`n' fu_hh_member_age`n'
		*5orunder at followup in hhs
		gen fu_fiveorunder_dummy_`n'=(fu_hh_member_age`n'<=5) if fu_hh_member_age`n'!=.
		*diarrhea 5or under at fup
		gen fu_diarrhea_fiveorunder_dummy_`n'=fu_diarrhea_dummy_`n'*fu_fiveorunder_dummy_`n' if fu_diarrhea_dummy_`n'!=.
		
		*arsenic at fup 
		gen fu_arsenicreported_dummy_`n'=fu_suffer_arsenic_rep_`n'=="yes" if fu_suffer_arsenic_rep_`n'!=""&fu_suffer_arsenic_rep_`n'!="dk"&fu_suffer_arsenic_rep_`n'!="ra"

		gen fu_arsenicreported_num_`n'=(fu_suffer_arsenic_rep_`n'!=""&fu_suffer_arsenic_rep_`n'!="dk"&fu_suffer_arsenic_rep_`n'!="ra")

		gen fu_arsenicobserved_dummy_`n'=fu_suffer_arsenic_obs_`n'=="symptoms_observed" if fu_suffer_arsenic_obs_`n'!=""&fu_suffer_arsenic_obs_`n'!="no_symptoms_unconf"
		
		gen fu_arsenicobserved_num_`n'=(fu_suffer_arsenic_obs_`n'!=""&fu_suffer_arsenic_obs_`n'!="no_symptoms_unconf")

		*died and 5or under at baseline
		gen died_fiveorunder_dummy_`n'=fiveorunder_dummy_`n'*(fu_hh_member_left_why_`n'=="died") if fiveorunder_dummy_`n'==1
	}
}
		
		*number of hhs member below 5
egen number_fiveorunder=rowtotal(fiveorunder_dummy_*)	
egen fu_number_fiveorunder=rowtotal(fu_fiveorunder_dummy_*)	
	
		*number of hhs member below 5 with diarrhea at baseline 
egen diarrhea_tot_fiveorunder=rowtotal(diarrhea_fiveorunder_dummy_*)	
gen diarrhea_share_fiveorunder=diarrhea_tot_fiveorunder/number_fiveorunder if number_fiveorunder!=0&number_fiveorunder!=.
		*at followup
egen fu_diarrhea_tot_fiveorunder=rowtotal(fu_diarrhea_fiveorunder_dummy_*)	
gen fu_diarrhea_share_fiveorunder=fu_diarrhea_tot_fiveorunder/fu_number_fiveorunder if fu_number_fiveorunder!=0&fu_number_fiveorunder!=.
		*differences
gen dif_diarrhea_share_fiveorunder=fu_diarrhea_share_fiveorunder-diarrhea_share_fiveorunder if diarrhea_share_fiveorunder!=.&fu_diarrhea_share_fiveorunder!=.

		*number under 5 died
egen died_fiveorunder=rowtotal(died_fiveorunder_dummy_*)
gen mortality_fiveorunder=died_fiveorunder/number_fiveorunder if number_fiveorunder!=0&number_fiveorunder!=.
***all zeroes!!

		*suffer from arsenic reported and observed at baseline
egen arsenicreported_tot=rowtotal(arsenicreported_dummy_*)	
egen arsenicreported_totnum=rowtotal(arsenicreported_num_*)	
gen arsenicreported_share=arsenicreported_tot/arsenicreported_totnum if arsenicreported_totnum!=0&arsenicreported_totnum!=.

egen arsenicobserved_tot=rowtotal(arsenicobserved_dummy_*)	
egen arsenicobserved_totnum=rowtotal(arsenicobserved_num_*)	
gen arsenicobserved_share=arsenicobserved_tot/arsenicobserved_totnum if arsenicobserved_totnum!=0&arsenicobserved_totnum!=.

		*at followup
egen fu_arsenicreported_tot=rowtotal(fu_arsenicreported_dummy_*)	
egen fu_arsenicreported_totnum=rowtotal(fu_arsenicreported_num_*)	
gen fu_arsenicreported_share=fu_arsenicreported_tot/fu_arsenicreported_totnum if fu_arsenicreported_totnum!=0&fu_arsenicreported_totnum!=.

egen fu_arsenicobserved_tot=rowtotal(fu_arsenicobserved_dummy_*)	
egen fu_arsenicobserved_totnum=rowtotal(fu_arsenicobserved_num_*)	
gen fu_arsenicobserved_share=fu_arsenicobserved_tot/fu_arsenicobserved_totnum if fu_arsenicobserved_totnum!=0&fu_arsenicobserved_totnum!=.

		*differences
gen dif_arsenicobserved_share=fu_arsenicobserved_share-arsenicobserved_share if arsenicobserved_share!=.&fu_arsenicobserved_share!=.

gen dif_arsenicreported_share=fu_arsenicreported_share-arsenicreported_share if arsenicreported_share!=.&fu_arsenicreported_share!=.

		*differences in died from diarrhea
destring died_diarrhea_number, replace
replace died_diarrhea_number=0 if died_diarrhea=="no"
destring fu_died_diarrhea_number, replace
replace fu_died_diarrhea_number=0 if fu_died_diarrhea=="no"
gen dif_died_diarrhea_hh_num=fu_died_diarrhea_number-died_diarrhea_number if fu_died_diarrhea_number!=.&died_diarrhea_number!=.

		*differences in died from arsenic
destring died_arsenic_number, replace
replace died_arsenic_number=0 if died_arsenic=="no"
destring fu_died_arsenic_number, replace
replace fu_died_arsenic_number=0 if fu_died_arsenic=="no"
gen dif_died_arsenic_hh_num=fu_died_arsenic_number-died_arsenic_number if fu_died_arsenic_number!=.&died_arsenic_number!=.

rename dif_diarrhea_share dif_ds
rename dif_diarrhea_share_fiveorunder dif_ds5
rename diarrhea_share ds
rename diarrhea_share_fiveorunder ds5
rename mortality_fiveorunder dif_mort5
rename dif_died_arsenic_hh_num dif_mort_as

rename dif_arsenicreported_share dif_asrep
rename dif_arsenicobserved_share dif_asobs
rename arsenicreported_share asrep
rename arsenicobserved_share asobs
rename hh_arsenic_field_test_result hh_arsenic_field_res 
rename village_code_correct village_code

* create indicators for minute length of walking distance
foreach stub of newlist desk algo {
	local v = "distance_hh_`stub'anchor"
*	gen `v'_ceil = ceil(`v'/80)
	tab `v'_ceil, gen(hh_d_`stub')
	foreach i of varlist hh_d_`stub'* {
		gen `i'_t = `i' * treated
	}
}

gen ws_hh_dist_final_wmean_mins = ws_hh_distance_final_wmean /80  
gen fu_ws_hh_dist_final_wmean_mins = fu_ws_hh_distance_final_wmean /80  
gen dif_ws_hh_dist_final_wmean_mins = dif_ws_hh_distance_final_wmean /80  

gen log_distance_hh_deskanchor = log(distance_hh_deskanchor)
gen log_distance_hh_algoanchor = log(distance_hh_algoanchor)

foreach t of varlist control cash labour waiver treated {
	gen ldist_deskanchor_`t' = `t' * log_distance_hh_deskanchor
	label var ldist_deskanchor_`t' "Log distance HH (final loc) - closest desk anchor (in m) * `t'"
	gen ldist_algoanchor_`t' = `t' * log_distance_hh_algoanchor
	label var ldist_algoanchor_`t' "Log distance HH (final loc) - closest algo anchor (in m) * `t'"
}

save $data/temp/temp_analysis.dta, replace

**********
/////////////////////////////////////////////////////// WS data at follow-up /////////////////////////////////////////////////////////////////////////////////////////

use "$data/temp/temp_analysis.dta", clear

* keep relevant variables/observations for WS list
keep fu_arsenic_res_w_ws* fu_arsenic_50plus_ws* fu_arsenic_any_ws* fu_enterbac_fc_ws* fu_enterbac_fc_exp_ws* village_code fu_ws_newcode_* fu_ws*_project_tw  
keep if fu_ws_newcode_1!=.


* reshape long: one row = one HH-WS
foreach i of numlist 1(1)4 {
	rename fu_ws`i'_project_tw fu_project_tw`i'
}

gen id=_n

reshape long fu_arsenic_res_w_ws fu_arsenic_50plus_ws fu_arsenic_any_ws fu_enterbac_fc_ws fu_enterbac_fc_exp_ws  fu_ws_newcode_ fu_project_tw , i(id) j(ws_num)


drop if fu_ws_newcode_==.
sort village_code fu_ws_newcode_

* collapse at WS level
collapse fu_arsenic_res_w_ws fu_arsenic_50plus_ws fu_arsenic_any_ws fu_enterbac_fc_ws fu_enterbac_fc_exp_ws fu_project_tw , by(village fu_ws_newcode )
replace fu_project_tw = 1 if fu_project_tw>0 & fu_project_tw<1 // checked browsing data

bys village: egen num_project_tws = total(fu_project_tw)

rename fu_ws_newcode ws_code
* order, rename, label vars
order village ws_code
label var fu_arsenic_50plus_ws "Arsenic contamination (Bangladeshi threshold)"
label var fu_arsenic_any_ws  "Arsenic contamination (Bangladeshi threshold)"
label var fu_enterbac_fc_ws  "Fecal contamination"
label var fu_enterbac_fc_exp_ws  "Fecal contamination"
label var fu_project_tw "Project TW"
label var num_project_tws "Number of project TWs per TU"

save "$data/temp/temp_ws.dta", replace

**********
/////////////////////////////////////////////////////// panel of WS data /////////////////////////////////////////////////////////////////////////////////////////

use "$data/ws_panel_anonymized.dta", clear

* run program for correction of fecal contamination data

gen enterbac_fc_temp = ws_bac_test_result if post==0
fecal_correction, invar(ws_bacteria_entry_lag) baseline(1) household(0)
gen enterbac_fc_exp = enterbac_fc_exp_temp if post==0
drop f_p sensitivity specificity f_c enterbac_fc_exp_temp ws_bacteria_entry_lag_*

gen enterbac_fc_temp = ws_bac_test_result if post==1
fecal_correction, invar(ws_bacteria_entry_lag) baseline(0) household(0)
replace enterbac_fc_exp = enterbac_fc_exp_temp if post==1
drop f_p sensitivity specificity f_c enterbac_fc_exp_temp ws_bacteria_entry_lag_*

save  "$data/temp/temp_ws_panel_anonymized.dta", replace

//the end